Introduction

FIFA 22 is a football simulation video game published by Electronic Arts as part of the FIFA series. It is the 29th installment in the FIFA series, and was released worldwide on 1 October 2021 for Microsoft Windows, Nintendo Switch, PlayStation 4, PlayStation 5, Xbox One, and Xbox Series X/S.

source: Wikipedia

Set-up & Packages

library(tidyverse)
library(magrittr)
library(DataExplorer)
library(maps)
library(plotly)
library(DT)
library(tidytext)
library(gridExtra)
library(factoextra)
library(kableExtra)
library(splitstackshape)
library(ggthemes)
library(data.table)
library(waffle)
library(knitr)

options(scipen = 999)

DATA

# Load the raw FIFA 22 player table.
df <- read.csv("../db/players_22.csv", encoding = "UTF-8")

# Preview the first three players, transposed so each feature is a table row.
df %>%
  head(3) %>%
  t() %>%
  kable("html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    font_size = 8
  ) %>%
  scroll_box(width = "100%")
1 2 3
sofifa_id 158023 188545 20801
player_url https://sofifa.com/player/158023/lionel-messi/220002 https://sofifa.com/player/188545/robert-lewandowski/220002 https://sofifa.com/player/20801/c-ronaldo-dos-santos-aveiro/220002
short_name L. Messi R. Lewandowski Cristiano Ronaldo
long_name Lionel Andrés Messi Cuccittini Robert Lewandowski Cristiano Ronaldo dos Santos Aveiro
player_positions RW, ST, CF ST ST, LW
overall 93 92 91
potential 93 92 91
value_eur 78000000 119500000 45000000
wage_eur 320000 270000 270000
age 34 32 36
dob 1987-06-24 1988-08-21 1985-02-05
height_cm 170 185 187
weight_kg 72 81 83
club_team_id 73 21 11
club_name Paris Saint-Germain FC Bayern München Manchester United
league_name French Ligue 1 German 1. Bundesliga English Premier League
league_level 1 1 1
club_position RW ST ST
club_jersey_number 30 9 7
club_loaned_from
club_joined 2021-08-10 2014-07-01 2021-08-27
club_contract_valid_until 2023 2023 2023
nationality_id 52 37 38
nationality_name Argentina Poland Portugal
nation_team_id 1369 1353 1354
nation_position RW RS ST
nation_jersey_number 10 9 7
preferred_foot Left Right Right
weak_foot 4 4 4
skill_moves 4 4 5
international_reputation 5 5 5
work_rate Medium/Low High/Medium High/Low
body_type Unique Unique Unique
real_face Yes Yes Yes
release_clause_eur 144300000 197200000 83300000
player_tags #Dribbler, #Distance Shooter, #FK Specialist, #Acrobat, #Clinical Finisher, #Complete Forward #Aerial Threat, #Distance Shooter, #Clinical Finisher, #Complete Forward #Aerial Threat, #Dribbler, #Distance Shooter, #Crosser, #Acrobat, #Clinical Finisher, #Complete Forward
player_traits Finesse Shot, Long Shot Taker (AI), Playmaker (AI), Outside Foot Shot, One Club Player, Chip Shot (AI), Technical Dribbler (AI) Solid Player, Finesse Shot, Outside Foot Shot, Chip Shot (AI) Power Free-Kick, Flair, Long Shot Taker (AI), Speed Dribbler (AI), Outside Foot Shot
pace 85 78 87
shooting 92 92 94
passing 91 79 80
dribbling 95 86 88
defending 34 44 34
physic 65 82 75
attacking_crossing 85 71 87
attacking_finishing 95 95 95
attacking_heading_accuracy 70 90 90
attacking_short_passing 91 85 80
attacking_volleys 88 89 86
skill_dribbling 96 85 88
skill_curve 93 79 81
skill_fk_accuracy 94 85 84
skill_long_passing 91 70 77
skill_ball_control 96 88 88
movement_acceleration 91 77 85
movement_sprint_speed 80 79 88
movement_agility 91 77 86
movement_reactions 94 93 94
movement_balance 95 82 74
power_shot_power 86 90 94
power_jumping 68 85 95
power_stamina 72 76 77
power_strength 69 86 77
power_long_shots 94 87 93
mentality_aggression 44 81 63
mentality_interceptions 40 49 29
mentality_positioning 93 95 95
mentality_vision 95 81 76
mentality_penalties 75 90 88
mentality_composure 96 88 95
defending_marking_awareness 20 35 24
defending_standing_tackle 35 42 32
defending_sliding_tackle 24 19 24
goalkeeping_diving 6 15 7
goalkeeping_handling 11 6 11
goalkeeping_kicking 15 12 15
goalkeeping_positioning 14 8 14
goalkeeping_reflexes 8 10 11
goalkeeping_speed NA NA NA
ls 89+3 90+2 90+1
st 89+3 90+2 90+1
rs 89+3 90+2 90+1
lw 92 85 88
lf 93 88 89
cf 93 88 89
rf 93 88 89
rw 92 85 88
lam 93 86+3 86+3
cam 93 86+3 86+3
ram 93 86+3 86+3
lm 91+2 84+3 86+3
lcm 87+3 80+3 78+3
cm 87+3 80+3 78+3
rcm 87+3 80+3 78+3
rm 91+2 84+3 86+3
lwb 66+3 64+3 63+3
ldm 64+3 66+3 59+3
cdm 64+3 66+3 59+3
rdm 64+3 66+3 59+3
rwb 66+3 64+3 63+3
lb 61+3 61+3 60+3
lcb 50+3 60+3 53+3
cb 50+3 60+3 53+3
rcb 50+3 60+3 53+3
rb 61+3 61+3 60+3
gk 19+3 19+3 20+3
player_face_url https://cdn.sofifa.net/players/158/023/22_120.png https://cdn.sofifa.net/players/188/545/22_120.png https://cdn.sofifa.net/players/020/801/22_120.png
club_logo_url https://cdn.sofifa.net/teams/73/60.png https://cdn.sofifa.net/teams/21/60.png https://cdn.sofifa.net/teams/11/60.png
club_flag_url https://cdn.sofifa.net/flags/fr.png https://cdn.sofifa.net/flags/de.png https://cdn.sofifa.net/flags/gb-eng.png
nation_logo_url https://cdn.sofifa.net/teams/1369/60.png https://cdn.sofifa.net/teams/1353/60.png https://cdn.sofifa.net/teams/1354/60.png
nation_flag_url https://cdn.sofifa.net/flags/ar.png https://cdn.sofifa.net/flags/pl.png https://cdn.sofifa.net/flags/pt.png

There are 110 columns and 19239 observations in the data.

# Dimensions of the data frame: number of rows, then number of columns.
dim(df)
## [1] 19239   110

Now we are going to check some more details about the data with the DataExplorer package.

# Summarise the data structure with introduce() and render it as an HTML table
df %>%
  introduce() %>%
  kable("html") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  scroll_box(width = "100%")
rows columns discrete_columns continuous_columns all_missing_columns total_missing_values complete_rows total_observations memory_usage
19239 110 50 60 0 68414 0 2116290 20813248
# Plot the same structure summary with plot_intro()
df %>%
  plot_intro(title = "Data Structure of database")

# Plot the share of missing values, restricted to features that have any
df %>%
  plot_missing(missing_only = TRUE, title = "Percentage of Missing Features")

Data Prep.

First, we need to deal with the missing features in our dataset.

Missing Values

Free agent players & retired players

Free agent players do not play for any club. Therefore, some features (club_contract_valid_until, club_jersey_number, league_level, club_team_id, wage_eur, value_eur) will be null.

# List the club names of players whose club-related features are all missing.
# Each club-related feature is tested exactly once; club_contract_valid_until
# is included so that no feature is checked twice while another is skipped.
df %>% 
  filter(is.na(club_team_id) &
          is.na(club_jersey_number) &
          is.na(league_level) &
          is.na(club_contract_valid_until) &
          is.na(wage_eur) &
          is.na(value_eur)) %$%
  unique(club_name)
## [1] ""

However, we can see from the graph above that there are some non-free-agent players who do not have a value. We can also assume that the value of these older players is equal to 0.

# Preview players that belong to a club but have no market value recorded.
df %>%
  filter(is.na(value_eur), club_name != "") %>%
  head() %>%
  kable("html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    font_size = 8
  ) %>%
  scroll_box(width = "100%")
sofifa_id player_url short_name long_name player_positions overall potential value_eur wage_eur age dob height_cm weight_kg club_team_id club_name league_name league_level club_position club_jersey_number club_loaned_from club_joined club_contract_valid_until nationality_id nationality_name nation_team_id nation_position nation_jersey_number preferred_foot weak_foot skill_moves international_reputation work_rate body_type real_face release_clause_eur player_tags player_traits pace shooting passing dribbling defending physic attacking_crossing attacking_finishing attacking_heading_accuracy attacking_short_passing attacking_volleys skill_dribbling skill_curve skill_fk_accuracy skill_long_passing skill_ball_control movement_acceleration movement_sprint_speed movement_agility movement_reactions movement_balance power_shot_power power_jumping power_stamina power_strength power_long_shots mentality_aggression mentality_interceptions mentality_positioning mentality_vision mentality_penalties mentality_composure defending_marking_awareness defending_standing_tackle defending_sliding_tackle goalkeeping_diving goalkeeping_handling goalkeeping_kicking goalkeeping_positioning goalkeeping_reflexes goalkeeping_speed ls st rs lw lf cf rf rw lam cam ram lm lcm cm rcm rm lwb ldm cdm rdm rwb lb lcb cb rcb rb gk player_face_url club_logo_url club_flag_url nation_logo_url nation_flag_url
138703 https://sofifa.com/player/138703/andres-dalessandro/220002 A. D’Alessandro Andrés Nicolás D’Alessandro CAM, RM 75 75 NA 900 40 1981-04-15 174 69 111325 Club Nacional de Football Uruguayan Primera División 1 SUB 10 2021-01-04 2021 52 Argentina NA NA Left 3 4 1 Medium/Low Normal (170-185) No NA Flair, Long Passer (AI), Playmaker (AI), Technical Dribbler (AI) 42 71 80 75 38 50 79 69 45 80 68 77 82 80 77 80 45 39 51 73 82 69 45 30 50 76 75 42 73 85 80 78 46 29 25 6 11 12 15 6 NA 66+2 66+2 66+2 72 72 72 72 72 75 75 75 71+2 71+2 71+2 71+2 71+2 53+2 58+2 58+2 58+2 53+2 49+2 47+2 47+2 47+2 49+2 17+2 https://cdn.sofifa.net/players/138/703/22_120.png https://cdn.sofifa.net/teams/111325/60.png https://cdn.sofifa.net/flags/uy.png https://cdn.sofifa.net/flags/ar.png
152912 https://sofifa.com/player/152912/jose-sand/220002 J. Sand José Gustavo Sand ST 75 75 NA 13000 40 1980-07-17 182 79 110395 Club Atlético Lanús Argentina Primera División 1 LS 9 2019-01-21 2022 52 Argentina NA NA Right 4 3 1 High/Low Normal (170-185) No NA Solid Player, Finesse Shot, Team Player 51 79 62 68 37 70 53 81 77 69 77 67 63 63 46 75 49 53 38 81 67 77 71 41 83 73 73 33 83 69 85 85 45 21 25 11 16 15 9 14 NA 75 75 75 68 73 73 73 68 70+2 70+2 70+2 66+2 63+2 63+2 63+2 66+2 49+2 52+2 52+2 52+2 49+2 47+2 51+2 51+2 51+2 47+2 20+2 https://cdn.sofifa.net/players/152/912/22_120.png https://cdn.sofifa.net/teams/110395/60.png https://cdn.sofifa.net/flags/ar.png https://cdn.sofifa.net/flags/ar.png
110381 https://sofifa.com/player/110381/maximiliano-rodriguez/220002 M. Rodríguez Maximiliano Rubén Rodríguez LW, LM, ST 73 73 NA 10000 40 1981-01-02 173 74 110396 Newell’s Old Boys Argentina Primera División 1 SUB 11 2019-01-21 2021 52 Argentina NA NA Right 4 4 3 Medium/Low Normal (170-185) No NA Leadership, Team Player 64 74 76 75 44 64 72 73 61 78 71 75 76 71 75 76 64 64 72 72 70 77 67 46 70 75 73 47 76 76 77 86 57 30 26 15 11 15 9 9 NA 72+1 72+1 72+1 73 74-1 74-1 74-1 73 75-2 75-2 75-2 72+1 70+3 70+3 70+3 72+1 57+3 60+3 60+3 60+3 57+3 54+3 54+3 54+3 54+3 54+3 18+3 https://cdn.sofifa.net/players/110/381/22_120.png https://cdn.sofifa.net/teams/110396/60.png https://cdn.sofifa.net/flags/ar.png https://cdn.sofifa.net/flags/ar.png
115909 https://sofifa.com/player/115909/ruben-castro-martin/220002 Rubén Castro Rubén Castro Martín ST 69 69 NA 3000 40 1981-06-27 169 68 100851 FC Cartagena Spanish Segunda División 2 ST 7 2020-09-12 2022 45 Spain NA NA Right 3 3 2 Medium/Low Normal (170-) No NA 70 70 61 72 29 61 60 71 62 60 66 70 68 67 45 69 72 69 86 75 81 69 85 62 61 68 54 37 77 74 75 68 36 12 14 14 8 15 8 12 NA 69 69 69 70-1 71-2 71-2 71-2 70-1 69 69 69 67+2 61+2 61+2 61+2 67+2 50+2 47+2 47+2 47+2 50+2 47+2 43+2 43+2 43+2 47+2 18+2 https://cdn.sofifa.net/players/115/909/22_120.png https://cdn.sofifa.net/teams/100851/60.png https://cdn.sofifa.net/flags/es.png https://cdn.sofifa.net/flags/es.png
153066 https://sofifa.com/player/153066/lucas-licht/220002 L. Licht Lucas Matías Licht LB, LWB, LM 69 69 NA 5000 40 1981-04-06 174 72 101084 Gimnasia y Esgrima La Plata Argentina Primera División 1 SUB 25 2012-07-21 2021 52 Argentina NA NA Left 4 3 1 Medium/Medium Normal (170-185) No NA Leadership, Early Crosser, Team Player 64 61 65 72 68 66 80 52 53 52 57 76 73 68 71 69 59 68 69 66 73 71 72 64 66 68 69 71 56 64 84 72 69 71 69 8 14 6 15 8 NA 61+2 61+2 61+2 65 63 63 63 65 64+2 64+2 64+2 66+2 65+2 65+2 65+2 66+2 68+1 67+2 67+2 67+2 68+1 67+2 67+2 67+2 67+2 67+2 17+2 https://cdn.sofifa.net/players/153/066/22_120.png https://cdn.sofifa.net/teams/101084/60.png https://cdn.sofifa.net/flags/ar.png https://cdn.sofifa.net/flags/ar.png
124344 https://sofifa.com/player/124344/ibrahim-ozturk/220002 İ. Öztürk İbrahim Öztürk CB 67 67 NA 3000 40 1981-06-21 186 80 101006 Altay SK Turkish Süper Lig 1 CB 38 2017-08-24 2022 48 Turkey NA NA Right 2 2 1 Low/High Normal (185+) No NA Dives Into Tackles (AI), Team Player 32 43 48 48 68 74 44 30 72 56 52 46 44 48 42 54 30 34 32 68 38 66 59 65 78 46 78 66 34 44 51 51 70 68 62 15 13 10 10 6 NA 50+2 50+2 50+2 44 47 47 47 44 46+2 46+2 46+2 46+2 52+2 52+2 52+2 46+2 57+2 62+2 62+2 62+2 57+2 59+2 67 67 67 59+2 17+2 https://cdn.sofifa.net/players/124/344/22_120.png https://cdn.sofifa.net/teams/101006/60.png https://cdn.sofifa.net/flags/tr.png https://cdn.sofifa.net/flags/tr.png

Fill the missing values

# Fill the missing club-related values
df$club_contract_valid_until <- replace(
  df$club_contract_valid_until, is.na(df$club_contract_valid_until), 0
)
df$club_jersey_number <- replace(
  df$club_jersey_number, is.na(df$club_jersey_number), 0
)

# league_level is an ordinal variable where 1 is the highest league and 5 is
# the lowest. Free agents do not play in any league at the moment, so we
# assign 6 to them.
df$league_level <- replace(df$league_level, is.na(df$league_level), 6)

# club_team_id runs from 1 to 115820; we assign 0 to the free agents.
df$club_team_id <- replace(df$club_team_id, is.na(df$club_team_id), 0)

df$wage_eur <- replace(df$wage_eur, is.na(df$wage_eur), 0)
df$value_eur <- replace(df$value_eur, is.na(df$value_eur), 0)

# Re-check which features still contain missing values.
df %>%
  plot_missing(missing_only = TRUE, title = "Percentage of Missing Features")

Release Clause

Not all players have a release clause, so it is natural for some release_clause_eur values to be missing.

# Preview players whose release clause is missing.
df %>%
  filter(is.na(release_clause_eur)) %>%
  head() %>%
  kable("html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    font_size = 8
  ) %>%
  scroll_box(width = "100%")
sofifa_id player_url short_name long_name player_positions overall potential value_eur wage_eur age dob height_cm weight_kg club_team_id club_name league_name league_level club_position club_jersey_number club_loaned_from club_joined club_contract_valid_until nationality_id nationality_name nation_team_id nation_position nation_jersey_number preferred_foot weak_foot skill_moves international_reputation work_rate body_type real_face release_clause_eur player_tags player_traits pace shooting passing dribbling defending physic attacking_crossing attacking_finishing attacking_heading_accuracy attacking_short_passing attacking_volleys skill_dribbling skill_curve skill_fk_accuracy skill_long_passing skill_ball_control movement_acceleration movement_sprint_speed movement_agility movement_reactions movement_balance power_shot_power power_jumping power_stamina power_strength power_long_shots mentality_aggression mentality_interceptions mentality_positioning mentality_vision mentality_penalties mentality_composure defending_marking_awareness defending_standing_tackle defending_sliding_tackle goalkeeping_diving goalkeeping_handling goalkeeping_kicking goalkeeping_positioning goalkeeping_reflexes goalkeeping_speed ls st rs lw lf cf rf rw lam cam ram lm lcm cm rcm rm lwb ldm cdm rdm rwb lb lcb cb rcb rb gk player_face_url club_logo_url club_flag_url nation_logo_url nation_flag_url
194765 https://sofifa.com/player/194765/antoine-griezmann/220002 A. Griezmann Antoine Griezmann ST, LW, RW 85 85 53000000 220000 30 1991-03-21 176 73 240 Atlético de Madrid Spain Primera Division 1 LS 17 FC Barcelona 2022 18 France 1335 RW 7 Left 3 4 4 Medium/Medium Unique Yes NA #Acrobat Finesse Shot, Flair, Long Shot Taker (AI), Playmaker (AI), Outside Foot Shot, Chip Shot (AI), Technical Dribbler (AI) 80 84 84 87 52 72 83 84 83 84 86 85 86 85 82 89 80 80 92 89 83 82 90 86 63 83 73 49 89 85 79 90 43 54 49 14 8 14 13 14 NA 84+1 84+1 84+1 85 86-1 86-1 86-1 85 86-1 86-1 86-1 85 82+3 82+3 82+3 85 71+3 69+3 69+3 69+3 71+3 69+3 63+3 63+3 63+3 69+3 21+3 https://cdn.sofifa.net/players/194/765/22_120.png https://cdn.sofifa.net/teams/240/60.png https://cdn.sofifa.net/flags/es.png https://cdn.sofifa.net/teams/1335/60.png https://cdn.sofifa.net/flags/fr.png
184087 https://sofifa.com/player/184087/toby-alderweireld/220002 T. Alderweireld Toby Alderweireld CB 83 83 0 0 32 1989-03-02 186 81 0 6 0 0 7 Belgium 1325 RCB 2 Right 3 2 3 Medium/Medium Normal (185+) Yes NA Long Passer (AI) 58 55 70 67 86 77 64 45 81 77 38 62 63 59 81 75 55 60 54 85 62 78 81 76 77 58 79 85 52 62 58 86 87 87 84 16 6 14 16 14 NA 65+3 65+3 65+3 63 65 65 65 63 66+3 66+3 66+3 66+3 73+3 73+3 73+3 66+3 77+3 81+2 81+2 81+2 77+3 78+3 82+1 82+1 82+1 78+3 21+3 https://cdn.sofifa.net/players/184/087/22_120.png https://cdn.sofifa.net/teams/1325/60.png https://cdn.sofifa.net/flags/be.png
201153 https://sofifa.com/player/201153/alvaro-borja-morata-martin/220002 Morata Álvaro Borja Morata Martín ST 83 83 37000000 84000 28 1992-10-23 190 84 45 Juventus Italian Serie A 1 ST 9 Atlético de Madrid 2022 45 Spain 1362 SUB 7 Right 4 3 3 Medium/Medium Unique Yes NA Speed Dribbler (AI) 82 80 72 81 31 77 72 84 86 78 80 83 78 44 60 83 79 85 72 80 63 80 84 77 80 72 69 24 87 77 75 79 37 14 20 4 5 4 4 5 NA 83 83 83 81 82 82 82 81 80+3 80+3 80+3 79+3 71+3 71+3 71+3 79+3 58+3 55+3 55+3 55+3 58+3 54+3 50+3 50+3 50+3 54+3 13+3 https://cdn.sofifa.net/players/201/153/22_120.png https://cdn.sofifa.net/teams/45/60.png https://cdn.sofifa.net/flags/it.png https://cdn.sofifa.net/teams/1362/60.png https://cdn.sofifa.net/flags/es.png
235805 https://sofifa.com/player/235805/federico-chiesa/220002 F. Chiesa Federico Chiesa RW, LW, RM 83 91 80500000 74000 23 1997-10-25 175 70 45 Juventus Italian Serie A 1 LM 22 Fiorentina 2022 27 Italy 1343 RW 14 Right 4 4 3 High/Medium Normal (170-185) Yes NA #Speedster, #Dribbler, #Acrobat Long Shot Taker (AI), Speed Dribbler (AI) 91 81 74 85 48 73 73 79 50 78 80 89 78 52 72 82 91 91 87 83 81 86 53 85 71 84 69 30 81 75 62 78 65 44 44 6 7 8 9 7 NA 79+3 79+3 79+3 83 82 82 82 83 82+3 82+3 82+3 82+3 75+3 75+3 75+3 82+3 67+3 64+3 64+3 64+3 67+3 64+3 57+3 57+3 57+3 64+3 16+3 https://cdn.sofifa.net/players/235/805/22_120.png https://cdn.sofifa.net/teams/45/60.png https://cdn.sofifa.net/flags/it.png https://cdn.sofifa.net/teams/1343/60.png https://cdn.sofifa.net/flags/it.png
180206 https://sofifa.com/player/180206/miralem-pjanic/220002 M. Pjanić Miralem Pjanić CM 82 82 25000000 155000 31 1990-04-02 178 72 327 Beşiktaş JK Turkish Süper Lig 1 RCM 15 FC Barcelona 2022 8 Bosnia and Herzegovina NA NA Right 4 3 3 Medium/Medium Normal (170-185) Yes NA #FK Specialist Finesse Shot, Playmaker (AI), Outside Foot Shot, Technical Dribbler (AI) 65 68 83 81 75 67 80 56 60 84 72 81 86 92 82 83 67 64 74 83 80 78 59 79 60 80 70 78 68 84 79 84 78 77 71 7 7 13 7 8 NA 70+3 70+3 70+3 76 75 75 75 76 78+3 78+3 78+3 77+3 81+1 81+1 81+1 77+3 78+3 79+3 79+3 79+3 78+3 76+3 73+3 73+3 73+3 76+3 16+3 https://cdn.sofifa.net/players/180/206/22_120.png https://cdn.sofifa.net/teams/327/60.png https://cdn.sofifa.net/flags/tr.png https://cdn.sofifa.net/flags/ba.png
193105 https://sofifa.com/player/193105/alphonse-areola/220002 A. Areola Alphonse Areola GK 82 84 26000000 75000 28 1993-02-27 195 94 19 West Ham United English Premier League 1 SUB 13 Paris Saint-Germain 2022 18 France NA NA Right 3 1 2 Medium/Medium Unique Yes NA Comes For Crosses NA NA NA NA NA NA 20 19 14 48 16 15 16 16 37 22 56 54 58 78 58 57 72 38 80 14 26 23 17 51 25 64 13 18 12 85 79 76 80 85 55 34+2 34+2 34+2 33 34 34 34 33 36+2 36+2 36+2 35+2 36+2 36+2 36+2 35+2 32+2 34+2 34+2 34+2 32+2 31+2 31+2 31+2 31+2 31+2 81+2 https://cdn.sofifa.net/players/193/105/22_120.png https://cdn.sofifa.net/teams/19/60.png https://cdn.sofifa.net/flags/gb-eng.png https://cdn.sofifa.net/flags/fr.png

The players with a missing release_clause_eur are players who do not have a release clause, so we set it to 0.

# A missing release clause is recorded as 0.
df$release_clause_eur <- replace(
  df$release_clause_eur, is.na(df$release_clause_eur), 0
)

# Re-check which features still contain missing values.
df %>%
  plot_missing(missing_only = TRUE, title = "Percentage of Missing Features")

Goalkeeping player

From the data we can see that 11.08% of players are missing basic features such as physic, defending, dribbling, passing, shooting, and pace. On the other hand, 88.92% of players are missing goalkeeping_speed. If we combine these two numbers we get 100%. Therefore, there might be two groups of players: goalkeepers and non-goalkeepers.

# Check the non-goalkeeper group: club positions of players without
# a goalkeeping_speed value
df %>%
  filter(is.na(goalkeeping_speed)) %>%
  pull(club_position) %>%
  unique()
##  [1] "RW"  "ST"  "LW"  "RCM" "CF"  "CDM" "LCB" "RDM" "RS"  "LCM" "SUB" "CAM"
## [13] "RCB" "LDM" "LB"  "RB"  "LM"  "RM"  "LS"  "CB"  "RES" ""    "RWB" "RF" 
## [25] "CM"  "LWB" "LAM" "LF"  "RAM"

As we expected, this group of players are non-goalkeepers. We can fill goalkeeping_speed with 0 since it is not related to their position.

# goalkeeping_speed is set to 0 wherever it is missing.
df$goalkeeping_speed <- replace(
  df$goalkeeping_speed, is.na(df$goalkeeping_speed), 0
)

# Check the goalkeeper group: club positions of players missing all six
# basic outfield features
df %>%
  filter(
    is.na(physic),
    is.na(defending),
    is.na(dribbling),
    is.na(passing),
    is.na(shooting),
    is.na(pace)
  ) %>%
  pull(club_position) %>%
  unique()
## [1] "GK"  "SUB" "RES" ""

Apparently there are not only GK values in this group, so we need to keep investigating.

# Among players missing all six basic outfield features, look at those whose
# club position is SUB, RES or empty and list their listed playing positions.
df %>%
  filter(
    is.na(physic),
    is.na(defending),
    is.na(dribbling),
    is.na(passing),
    is.na(shooting),
    is.na(pace)
  ) %>%
  filter(club_position %in% c("SUB", "RES", "")) %>%
  pull(player_positions) %>%
  unique()
## [1] "GK"

After more investigation, the players with club_position equal to SUB, RES, or "" in the second group are also goalkeepers, so we can fill these features with 0.

# Missing basic outfield features are filled with 0.
df$physic <- replace(df$physic, is.na(df$physic), 0)
df$defending <- replace(df$defending, is.na(df$defending), 0)
df$dribbling <- replace(df$dribbling, is.na(df$dribbling), 0)
df$passing <- replace(df$passing, is.na(df$passing), 0)
df$shooting <- replace(df$shooting, is.na(df$shooting), 0)
df$pace <- replace(df$pace, is.na(df$pace), 0)

# Re-check which features still contain missing values.
df %>%
  plot_missing(missing_only = TRUE, title = "Percentage of Missing Features")

National Team player

The missing values of nation_jersey_number and nation_team_id belong to the majority of players who were not selected for their national team. We will fill these with 0.

# Missing national-team fields are filled with 0.
df$nation_jersey_number <- replace(
  df$nation_jersey_number, is.na(df$nation_jersey_number), 0
)
df$nation_team_id <- replace(df$nation_team_id, is.na(df$nation_team_id), 0)

# Plot the overall data structure again after the fills.
df %>%
  plot_intro(title = "Data Structure of database")

With this we have dealt with all missing values and reached tidy data.

# Persist df in .Rdata format, then list the contents of the output folder
save(list = "df", file = "../output/fifa_22_tidydata_raw.Rdata")
list.files("../output")
## [1] "FIFA_22_Analysis_files"         "FIFA_22_Analysis.html"         
## [3] "fifa_22_tidydata_cleaned.Rdata" "fifa_22_tidydata_raw.Rdata"

Data Manipulation

Unnecessary Features

# Drop the URL, long name, real-face flag and the per-position rating columns.
df <- df %>%
  select(-all_of(c(
    "player_url", "long_name", "real_face",
    "ls", "st", "rs", "lw", "lf", "cf", "rf", "rw",
    "lam", "cam", "ram", "lm", "lcm", "cm", "rcm", "rm",
    "lwb", "ldm", "cdm", "rdm", "rwb",
    "lb", "lcb", "cb", "rcb", "rb", "gk"
  )))

# Preview the first six players, transposed so each feature is a table row.
df %>%
  head() %>%
  t() %>%
  kable("html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    font_size = 8
  ) %>%
  scroll_box(width = "100%")
1 2 3 4 5 6
sofifa_id 158023 188545 20801 190871 192985 200389
short_name L. Messi R. Lewandowski Cristiano Ronaldo Neymar Jr K. De Bruyne J. Oblak
player_positions RW, ST, CF ST ST, LW LW, CAM CM, CAM GK
overall 93 92 91 91 91 91
potential 93 92 91 91 91 93
value_eur 78000000 119500000 45000000 129000000 125500000 112000000
wage_eur 320000 270000 270000 270000 350000 130000
age 34 32 36 29 30 28
dob 1987-06-24 1988-08-21 1985-02-05 1992-02-05 1991-06-28 1993-01-07
height_cm 170 185 187 175 181 188
weight_kg 72 81 83 68 70 87
club_team_id 73 21 11 73 10 240
club_name Paris Saint-Germain FC Bayern München Manchester United Paris Saint-Germain Manchester City Atlético de Madrid
league_name French Ligue 1 German 1. Bundesliga English Premier League French Ligue 1 English Premier League Spain Primera Division
league_level 1 1 1 1 1 1
club_position RW ST ST LW RCM GK
club_jersey_number 30 9 7 10 17 13
club_loaned_from
club_joined 2021-08-10 2014-07-01 2021-08-27 2017-08-03 2015-08-30 2014-07-16
club_contract_valid_until 2023 2023 2023 2025 2025 2023
nationality_id 52 37 38 54 7 44
nationality_name Argentina Poland Portugal Brazil Belgium Slovenia
nation_team_id 1369 1353 1354 0 1325 0
nation_position RW RS ST RCM
nation_jersey_number 10 9 7 0 7 0
preferred_foot Left Right Right Right Right Right
weak_foot 4 4 4 5 5 3
skill_moves 4 4 5 5 4 1
international_reputation 5 5 5 5 4 5
work_rate Medium/Low High/Medium High/Low High/Medium High/High Medium/Medium
body_type Unique Unique Unique Unique Unique Unique
release_clause_eur 144300000 197200000 83300000 238700000 232200000 238000000
player_tags #Dribbler, #Distance Shooter, #FK Specialist, #Acrobat, #Clinical Finisher, #Complete Forward #Aerial Threat, #Distance Shooter, #Clinical Finisher, #Complete Forward #Aerial Threat, #Dribbler, #Distance Shooter, #Crosser, #Acrobat, #Clinical Finisher, #Complete Forward #Speedster, #Dribbler, #Playmaker, #FK Specialist, #Acrobat, #Complete Midfielder #Dribbler, #Playmaker, #Engine, #Distance Shooter, #Crosser, #Complete Midfielder
player_traits Finesse Shot, Long Shot Taker (AI), Playmaker (AI), Outside Foot Shot, One Club Player, Chip Shot (AI), Technical Dribbler (AI) Solid Player, Finesse Shot, Outside Foot Shot, Chip Shot (AI) Power Free-Kick, Flair, Long Shot Taker (AI), Speed Dribbler (AI), Outside Foot Shot Injury Prone, Flair, Speed Dribbler (AI), Playmaker (AI), Outside Foot Shot, Technical Dribbler (AI) Injury Prone, Leadership, Early Crosser, Long Passer (AI), Long Shot Taker (AI), Playmaker (AI), Outside Foot Shot GK Long Throw, Comes For Crosses
pace 85 78 87 91 76 0
shooting 92 92 94 83 86 0
passing 91 79 80 86 93 0
dribbling 95 86 88 94 88 0
defending 34 44 34 37 64 0
physic 65 82 75 63 78 0
attacking_crossing 85 71 87 85 94 13
attacking_finishing 95 95 95 83 82 11
attacking_heading_accuracy 70 90 90 63 55 15
attacking_short_passing 91 85 80 86 94 43
attacking_volleys 88 89 86 86 82 13
skill_dribbling 96 85 88 95 88 12
skill_curve 93 79 81 88 85 13
skill_fk_accuracy 94 85 84 87 83 14
skill_long_passing 91 70 77 81 93 40
skill_ball_control 96 88 88 95 91 30
movement_acceleration 91 77 85 93 76 43
movement_sprint_speed 80 79 88 89 76 60
movement_agility 91 77 86 96 79 67
movement_reactions 94 93 94 89 91 88
movement_balance 95 82 74 84 78 49
power_shot_power 86 90 94 80 91 59
power_jumping 68 85 95 64 63 78
power_stamina 72 76 77 81 89 41
power_strength 69 86 77 53 74 78
power_long_shots 94 87 93 81 91 12
mentality_aggression 44 81 63 63 76 34
mentality_interceptions 40 49 29 37 66 19
mentality_positioning 93 95 95 86 88 11
mentality_vision 95 81 76 90 94 65
mentality_penalties 75 90 88 93 83 11
mentality_composure 96 88 95 93 89 68
defending_marking_awareness 20 35 24 35 68 27
defending_standing_tackle 35 42 32 32 65 12
defending_sliding_tackle 24 19 24 29 53 18
goalkeeping_diving 6 15 7 9 15 87
goalkeeping_handling 11 6 11 9 13 92
goalkeeping_kicking 15 12 15 15 5 78
goalkeeping_positioning 14 8 14 15 10 90
goalkeeping_reflexes 8 10 11 11 13 90
goalkeeping_speed 0 0 0 0 0 50
player_face_url https://cdn.sofifa.net/players/158/023/22_120.png https://cdn.sofifa.net/players/188/545/22_120.png https://cdn.sofifa.net/players/020/801/22_120.png https://cdn.sofifa.net/players/190/871/22_120.png https://cdn.sofifa.net/players/192/985/22_120.png https://cdn.sofifa.net/players/200/389/22_120.png
club_logo_url https://cdn.sofifa.net/teams/73/60.png https://cdn.sofifa.net/teams/21/60.png https://cdn.sofifa.net/teams/11/60.png https://cdn.sofifa.net/teams/73/60.png https://cdn.sofifa.net/teams/10/60.png https://cdn.sofifa.net/teams/240/60.png
club_flag_url https://cdn.sofifa.net/flags/fr.png https://cdn.sofifa.net/flags/de.png https://cdn.sofifa.net/flags/gb-eng.png https://cdn.sofifa.net/flags/fr.png https://cdn.sofifa.net/flags/gb-eng.png https://cdn.sofifa.net/flags/es.png
nation_logo_url https://cdn.sofifa.net/teams/1369/60.png https://cdn.sofifa.net/teams/1353/60.png https://cdn.sofifa.net/teams/1354/60.png https://cdn.sofifa.net/teams/1325/60.png
nation_flag_url https://cdn.sofifa.net/flags/ar.png https://cdn.sofifa.net/flags/pl.png https://cdn.sofifa.net/flags/pt.png https://cdn.sofifa.net/flags/br.png https://cdn.sofifa.net/flags/be.png https://cdn.sofifa.net/flags/si.png

Stacked columns

work_rate, player_tags, and player_traits are stacked columns. This means each of these columns may hold more than one value per row.

# Unstack work_rate: split on "/" into two columns and give them
# descriptive names.
df <- df %>%
  cSplit("work_rate", sep = "/", type.convert = FALSE) %>%
  rename(
    work_rate_att = work_rate_1,
    work_rate_dff = work_rate_2
  )

# Preview the first three players, transposed so each feature is a table row.
df %>%
  head(3) %>%
  t() %>%
  kable("html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    font_size = 8
  ) %>%
  scroll_box(width = "100%")
sofifa_id 158023 188545 20801
short_name L. Messi R. Lewandowski Cristiano Ronaldo
player_positions RW, ST, CF ST ST, LW
overall 93 92 91
potential 93 92 91
value_eur 78000000 119500000 45000000
wage_eur 320000 270000 270000
age 34 32 36
dob 1987-06-24 1988-08-21 1985-02-05
height_cm 170 185 187
weight_kg 72 81 83
club_team_id 73 21 11
club_name Paris Saint-Germain FC Bayern München Manchester United
league_name French Ligue 1 German 1. Bundesliga English Premier League
league_level 1 1 1
club_position RW ST ST
club_jersey_number 30 9 7
club_loaned_from
club_joined 2021-08-10 2014-07-01 2021-08-27
club_contract_valid_until 2023 2023 2023
nationality_id 52 37 38
nationality_name Argentina Poland Portugal
nation_team_id 1369 1353 1354
nation_position RW RS ST
nation_jersey_number 10 9 7
preferred_foot Left Right Right
weak_foot 4 4 4
skill_moves 4 4 5
international_reputation 5 5 5
body_type Unique Unique Unique
release_clause_eur 144300000 197200000 83300000
player_tags #Dribbler, #Distance Shooter, #FK Specialist, #Acrobat, #Clinical Finisher, #Complete Forward #Aerial Threat, #Distance Shooter, #Clinical Finisher, #Complete Forward #Aerial Threat, #Dribbler, #Distance Shooter, #Crosser, #Acrobat, #Clinical Finisher, #Complete Forward
player_traits Finesse Shot, Long Shot Taker (AI), Playmaker (AI), Outside Foot Shot, One Club Player, Chip Shot (AI), Technical Dribbler (AI) Solid Player, Finesse Shot, Outside Foot Shot, Chip Shot (AI) Power Free-Kick, Flair, Long Shot Taker (AI), Speed Dribbler (AI), Outside Foot Shot
pace 85 78 87
shooting 92 92 94
passing 91 79 80
dribbling 95 86 88
defending 34 44 34
physic 65 82 75
attacking_crossing 85 71 87
attacking_finishing 95 95 95
attacking_heading_accuracy 70 90 90
attacking_short_passing 91 85 80
attacking_volleys 88 89 86
skill_dribbling 96 85 88
skill_curve 93 79 81
skill_fk_accuracy 94 85 84
skill_long_passing 91 70 77
skill_ball_control 96 88 88
movement_acceleration 91 77 85
movement_sprint_speed 80 79 88
movement_agility 91 77 86
movement_reactions 94 93 94
movement_balance 95 82 74
power_shot_power 86 90 94
power_jumping 68 85 95
power_stamina 72 76 77
power_strength 69 86 77
power_long_shots 94 87 93
mentality_aggression 44 81 63
mentality_interceptions 40 49 29
mentality_positioning 93 95 95
mentality_vision 95 81 76
mentality_penalties 75 90 88
mentality_composure 96 88 95
defending_marking_awareness 20 35 24
defending_standing_tackle 35 42 32
defending_sliding_tackle 24 19 24
goalkeeping_diving 6 15 7
goalkeeping_handling 11 6 11
goalkeeping_kicking 15 12 15
goalkeeping_positioning 14 8 14
goalkeeping_reflexes 8 10 11
goalkeeping_speed 0 0 0
player_face_url https://cdn.sofifa.net/players/158/023/22_120.png https://cdn.sofifa.net/players/188/545/22_120.png https://cdn.sofifa.net/players/020/801/22_120.png
club_logo_url https://cdn.sofifa.net/teams/73/60.png https://cdn.sofifa.net/teams/21/60.png https://cdn.sofifa.net/teams/11/60.png
club_flag_url https://cdn.sofifa.net/flags/fr.png https://cdn.sofifa.net/flags/de.png https://cdn.sofifa.net/flags/gb-eng.png
nation_logo_url https://cdn.sofifa.net/teams/1369/60.png https://cdn.sofifa.net/teams/1353/60.png https://cdn.sofifa.net/teams/1354/60.png
nation_flag_url https://cdn.sofifa.net/flags/ar.png https://cdn.sofifa.net/flags/pl.png https://cdn.sofifa.net/flags/pt.png
work_rate_att Medium High High
work_rate_dff Low Medium Low

For features like player_tags and player_traits, we need to one-hot encode them.

# One-hot encode a list of token vectors.
#
# x: a list of character vectors (e.g. the result of strsplit()), one
#    element per row of the data.
# Returns a data frame with one row per element of x and one integer column
# per distinct token; a cell is 1 when the token occurs in that element and
# 0 otherwise.
one_hot <- function(x) {
  x %>%
    # Trim and de-duplicate tokens first, so that variants differing only in
    # surrounding whitespace (e.g. "#Tactician " vs "#Tactician") share one
    # indicator column instead of producing two.
    map(~ unique(str_trim(.x))) %>%
    map_df(table) %>%
    mutate(across(everything(), as.integer)) %>%
    # Tokens absent from a row come back as NA after row-binding.
    mutate(across(everything(), ~ replace_na(.x, 0L)))
}

# Strip the "(AI)" marker from trait names; the leftover whitespace is
# trimmed inside one_hot().
df$player_traits <- str_remove_all(df$player_traits, "\\(AI\\)")

# Give players without tags/traits an explicit placeholder category.
df$player_tags[df$player_tags == ""] <- "#No_Tags"
df$player_traits[df$player_traits == ""] <- "No Traits"


# Append the indicator columns and drop the original stacked columns.
df %<>% 
  mutate(one_hot(strsplit(player_tags, ", "))) %>% 
  mutate(one_hot(strsplit(player_traits, ", "))) %>% 
  select(-player_tags, -player_traits) 

# Preview the first three players, transposed so each feature is a table row.
df %$% 
  kable(t(head(.,3)), "html") %>% 
  kable_styling(bootstrap_options = c("striped","hover", "condensed"), font_size = 8) %>% 
  scroll_box(width = "100%")
sofifa_id 158023 188545 20801
short_name L. Messi R. Lewandowski Cristiano Ronaldo
player_positions RW, ST, CF ST ST, LW
overall 93 92 91
potential 93 92 91
value_eur 78000000 119500000 45000000
wage_eur 320000 270000 270000
age 34 32 36
dob 1987-06-24 1988-08-21 1985-02-05
height_cm 170 185 187
weight_kg 72 81 83
club_team_id 73 21 11
club_name Paris Saint-Germain FC Bayern München Manchester United
league_name French Ligue 1 German 1. Bundesliga English Premier League
league_level 1 1 1
club_position RW ST ST
club_jersey_number 30 9 7
club_loaned_from
club_joined 2021-08-10 2014-07-01 2021-08-27
club_contract_valid_until 2023 2023 2023
nationality_id 52 37 38
nationality_name Argentina Poland Portugal
nation_team_id 1369 1353 1354
nation_position RW RS ST
nation_jersey_number 10 9 7
preferred_foot Left Right Right
weak_foot 4 4 4
skill_moves 4 4 5
international_reputation 5 5 5
body_type Unique Unique Unique
release_clause_eur 144300000 197200000 83300000
pace 85 78 87
shooting 92 92 94
passing 91 79 80
dribbling 95 86 88
defending 34 44 34
physic 65 82 75
attacking_crossing 85 71 87
attacking_finishing 95 95 95
attacking_heading_accuracy 70 90 90
attacking_short_passing 91 85 80
attacking_volleys 88 89 86
skill_dribbling 96 85 88
skill_curve 93 79 81
skill_fk_accuracy 94 85 84
skill_long_passing 91 70 77
skill_ball_control 96 88 88
movement_acceleration 91 77 85
movement_sprint_speed 80 79 88
movement_agility 91 77 86
movement_reactions 94 93 94
movement_balance 95 82 74
power_shot_power 86 90 94
power_jumping 68 85 95
power_stamina 72 76 77
power_strength 69 86 77
power_long_shots 94 87 93
mentality_aggression 44 81 63
mentality_interceptions 40 49 29
mentality_positioning 93 95 95
mentality_vision 95 81 76
mentality_penalties 75 90 88
mentality_composure 96 88 95
defending_marking_awareness 20 35 24
defending_standing_tackle 35 42 32
defending_sliding_tackle 24 19 24
goalkeeping_diving 6 15 7
goalkeeping_handling 11 6 11
goalkeeping_kicking 15 12 15
goalkeeping_positioning 14 8 14
goalkeeping_reflexes 8 10 11
goalkeeping_speed 0 0 0
player_face_url https://cdn.sofifa.net/players/158/023/22_120.png https://cdn.sofifa.net/players/188/545/22_120.png https://cdn.sofifa.net/players/020/801/22_120.png
club_logo_url https://cdn.sofifa.net/teams/73/60.png https://cdn.sofifa.net/teams/21/60.png https://cdn.sofifa.net/teams/11/60.png
club_flag_url https://cdn.sofifa.net/flags/fr.png https://cdn.sofifa.net/flags/de.png https://cdn.sofifa.net/flags/gb-eng.png
nation_logo_url https://cdn.sofifa.net/teams/1369/60.png https://cdn.sofifa.net/teams/1353/60.png https://cdn.sofifa.net/teams/1354/60.png
nation_flag_url https://cdn.sofifa.net/flags/ar.png https://cdn.sofifa.net/flags/pl.png https://cdn.sofifa.net/flags/pt.png
work_rate_att Medium High High
work_rate_dff Low Medium Low
#Acrobat 1 0 1
#Clinical Finisher 1 1 1
#Complete Forward 1 1 1
#Distance Shooter 1 1 1
#Dribbler 1 0 1
#FK Specialist 1 0 0
#Aerial Threat 0 1 1
#Crosser 0 0 1
#Complete Midfielder 0 0 0
#Playmaker 0 0 0
#Speedster 0 0 0
#Engine 0 0 0
#No_Tags 0 0 0
#Tackling 0 0 0
#Tactician  0 0 0
#Poacher 0 0 0
#Complete Defender 0 0 0
#Strength 0 0 0
#Tactician 0 0 0
#Tackling  0 0 0
#Playmaker  0 0 0
Chip Shot 1 1 0
Finesse Shot 1 1 0
Long Shot Taker 1 0 1
One Club Player 1 0 0
Outside Foot Shot 1 1 1
Playmaker 1 0 0
Technical Dribbler 1 0 0
Solid Player 0 1 0
Flair 0 0 1
Power Free-Kick 0 0 1
Speed Dribbler 0 0 1
Injury Prone 0 0 0
Early Crosser 0 0 0
Leadership 0 0 0
Long Passer 0 0 0
Comes For Crosses 0 0 0
GK Long Throw 0 0 0
Rushes Out Of Goal 0 0 0
Saves with Feet 0 0 0
Team Player 0 0 0
Dives Into Tackles 0 0 0
Power Header 0 0 0
Cautious With Crosses 0 0 0
Long Throw-in 0 0 0
No Traits 0 0 0
Giant Throw-in 0 0 0

One player can play multiple positions. However, as we can see, the first position listed in player_positions is their main position. Therefore, we are going to keep it as their only position.

# Keep only the first listed position: `player_positions` is a ", "-separated
# string and the first entry is treated as the player's main position.
# vapply() pins the result to exactly one string per player (NA when nothing
# is listed), so the column type cannot change with the input as it can with
# sapply() (which returns a list for empty input).
df$player_positions <- vapply(
  strsplit(df$player_positions, ", ", fixed = TRUE),
  function(positions) positions[1],
  character(1)
)

# Position codes that make up the defender and midfielder classes.
defence <- c("CB", "RB", "LB", "LWB", "RWB", "LCB", "RCB")
midfielder <- c(
  "CM", "CDM", "CAM", "LM", "RM", "LAM", "RAM", "LCM", "RCM", "LDM", "RDM"
)

# Collapse the detailed position into four broad classes. Anything that is
# not a goalkeeper, defender or midfielder is classed as a forward.
df %<>% mutate(Class = case_when(
  player_positions %in% "GK" ~ "GK",
  player_positions %in% defence ~ "DEF",
  player_positions %in% midfielder ~ "MID",
  TRUE ~ "FWD"
))

# The lookup vectors are only needed to build `Class`.
rm(defence, midfielder)

Now we have finished cleaning our data and can begin our analysis.

# Persist the cleaned data frame `df` in .Rdata format, then show what the
# output directory contains.
save(list = "df", file = "../output/fifa_22_tidydata_cleaned.Rdata")
list.files("../output")
## [1] "FIFA_22_Analysis_files"         "FIFA_22_Analysis.html"         
## [3] "fifa_22_tidydata_cleaned.Rdata" "fifa_22_tidydata_raw.Rdata"

EDA

# Reload the cleaned data set written by the previous step.
load(file = "../output/fifa_22_tidydata_cleaned.Rdata")

# Ten nationalities with the most players, rendered as a scrollable HTML table.
df %>%
  count(nationality_name, name = "n_player") %>%
  arrange(desc(n_player)) %>%
  head(10) %>%
  kable(format = "html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    font_size = 8
  ) %>%
  scroll_box(width = "100%")
nationality_name n_player
England 1719
Germany 1214
Spain 1086
France 980
Argentina 960
Brazil 897
Japan 546
Netherlands 439
United States 413
Poland 403
# Plot size for notebook front ends. The option is spelled
# `repr.plot.height`; the previous `repr.plot.heigh` was silently ignored.
options(repr.plot.width = 12, repr.plot.height = 8)

world_map <- map_data("world")

# Nationality names in `df` that are spelled differently from the region
# names of the `maps` world data. All four UK home nations map to the single
# "UK" region so their players are summed rather than only England's.
# NOTE(review): other nationalities may also be spelled differently from the
# map regions and would still show as missing - extend this lookup as needed.
region_lookup <- c(
  "England" = "UK",
  "Scotland" = "UK",
  "Wales" = "UK",
  "Northern Ireland" = "UK",
  "United States" = "USA",
  "Korea Republic" = "South Korea",
  "China PR" = "China",
  "Republic of Ireland" = "Ireland"
)

# Attach the number of players per map region to every polygon vertex.
# Regions without any matching players get NA.
numofplayers <- world_map %>%
  mutate(region = as.character(region)) %>%
  left_join(
    df %>%
      mutate(
        Nationality = as.character(nationality_name),
        # Use the map spelling where one is known, otherwise keep the name.
        region = coalesce(unname(region_lookup[Nationality]), Nationality)
      ) %>%
      count(region, name = "Number of Player"),
    by = "region"
  )

# The lookup is only needed to build `numofplayers`.
rm(region_lookup)

ggplotly(
  ggplot(numofplayers, aes(long, lat, group = group)) +
    geom_polygon(
      aes(fill = `Number of Player`),
      color = "white",
      show.legend = FALSE
    ) +
    scale_fill_viridis_c(option = "C") +
    theme_fivethirtyeight() +
    labs(
      fill = "Number of Player",
      title = "Number of Player From Around the World"
    )
)

There are 163 countries in the database, and European countries have the most players. England has the most players in the game with 1719, followed by Germany and Spain with 1214 and 1086 players, respectively.

# Number of distinct clubs. Missing club ids are excluded: unique() keeps NA
# as a value of its own, which would count "no club" as one extra club.
# NOTE(review): presumably NA ids belong to players without a club - confirm.
n_clubs <- n_distinct(df$club_team_id, na.rm = TRUE)
paste0("Total number of clubs :", n_clubs)
## [1] "Total number of clubs :702"
# Player with the highest `potential` (first row after sorting descending).
max_pa <- df %>%
  arrange(desc(potential)) %>%
  slice(1)

# Player with the highest `overall` rating.
max_ca <- df %>%
  arrange(desc(overall)) %>%
  slice(1)

# Print both names; the label typos ("Potentia", "Perforamnce") are corrected.
cat(
  "Best Player\n--------------------------------------------\n",
  paste0("Maximum Potential :", max_pa$short_name),
  "\n",
  paste0("Maximum Overall Performance :", max_ca$short_name)
)
## Best Player
## --------------------------------------------
##  Maximum Potentia :K. Mbappé 
##  Maximum Overall Perforamnce :L. Messi
# Attribute columns for which the best player is looked up.
atts_list <- c(
  "pace", "shooting", "passing", "dribbling", "defending", "physic",
  "attacking_crossing", "attacking_finishing", "attacking_heading_accuracy",
  "attacking_short_passing", "attacking_volleys",
  "skill_dribbling", "skill_curve", "skill_fk_accuracy",
  "skill_long_passing", "skill_ball_control",
  "movement_acceleration", "movement_sprint_speed", "movement_agility",
  "movement_reactions", "movement_balance",
  "power_shot_power", "power_jumping", "power_stamina", "power_strength",
  "power_long_shots",
  "mentality_aggression", "mentality_interceptions", "mentality_positioning",
  "mentality_vision", "mentality_penalties", "mentality_composure",
  "defending_marking_awareness", "defending_standing_tackle",
  "defending_sliding_tackle",
  "goalkeeping_diving", "goalkeeping_handling", "goalkeeping_kicking",
  "goalkeeping_positioning", "goalkeeping_reflexes", "goalkeeping_speed"
)

atts <- df %>%
  select(short_name, all_of(atts_list))

# Convert explicitly so that melt() and the `[ , by = ]` call below use
# data.table semantics whatever class `df` happens to have.
bestof <- melt(
  as.data.table(atts),
  id.vars = "short_name",
  variable.name = "Attribute"
)

# Per attribute, keep the first player holding the maximum value
# (which.max() skips NA and returns the first maximum).
bestof <- bestof[, .SD[which.max(value)], by = Attribute][
  , c("Attribute", "short_name")
]

kable(bestof, "html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    font_size = 8
  ) %>%
  scroll_box(width = "100%")
Attribute short_name
pace K. Mbappé
shooting Cristiano Ronaldo
passing K. De Bruyne
dribbling L. Messi
defending V. van Dijk
physic Casemiro
attacking_crossing K. De Bruyne
attacking_finishing L. Messi
attacking_heading_accuracy L. de Jong
attacking_short_passing K. De Bruyne
attacking_volleys L. Suárez
skill_dribbling L. Messi
skill_curve Quaresma
skill_fk_accuracy L. Messi
skill_long_passing K. De Bruyne
skill_ball_control L. Messi
movement_acceleration K. Mbappé
movement_sprint_speed K. Mbappé
movement_agility Neymar Jr
movement_reactions L. Messi
movement_balance R. Fraser
power_shot_power A. Kolarov
power_jumping Cristiano Ronaldo
power_stamina N. Kanté
power_strength A. Akinfenwa
power_long_shots L. Messi
mentality_aggression B. Pearson
mentality_interceptions N. Kanté
mentality_positioning T. Müller
mentality_vision L. Messi
mentality_penalties Neymar Jr
mentality_composure L. Messi
defending_marking_awareness G. Chiellini
defending_standing_tackle N. Kanté
defending_sliding_tackle A. Wan-Bissaka
goalkeeping_diving G. Donnarumma
goalkeeping_handling J. Oblak
goalkeeping_kicking Ederson
goalkeeping_positioning S. Handanovič
goalkeeping_reflexes J. Oblak
goalkeeping_speed Jordi Masip
# Row of the player with the largest `value_eur`.
max_value <- df %>%
  arrange(desc(value_eur)) %>%
  slice_head(n = 1)

# Row of the player with the largest `wage_eur`.
max_wage <- df %>%
  arrange(desc(wage_eur)) %>%
  slice_head(n = 1)

# Print the short names of both players.
cat(
  "Top Earners\n----------------------------------\n",
  paste0("Maximum Value :", max_value$short_name),
  "\n",
  paste0("Maximum Wages :", max_wage$short_name)
)
## Top Earners
## ----------------------------------
##  Maximum Value :K. Mbappé 
##  Maximum Wages :K. De Bruyne
# Number of players per preferred foot and their share rounded to whole
# percent (one waffle square per percentage point).
foots <- df %>%
  count(preferred_foot, name = "n_player") %>%
  mutate(p_player = round(100 * n_player / sum(n_player)))

# Left-footed players per 100, taken from the data so the subtitle cannot
# drift from what the chart shows (it used to be a hard-coded "24").
lefty_per_100 <- sum(foots$p_player[foots$preferred_foot %in% "Left"])

ggplot(foots, aes(fill = preferred_foot, values = p_player)) +
  geom_waffle(n_rows = 10, colour = "White") +
  scale_fill_manual(
    name = NULL,
    values = c("grey50", "grey"),
    labels = c("Left", "Right")
  ) +
  ggtitle(
    "Left vs Right foot",
    subtitle = paste(
      "There are", lefty_per_100, "lefty out of every 100 players"
    )
  ) +
  coord_equal() +
  theme_fivethirtyeight() +
  theme(
    panel.grid.major = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_blank()
  )

Player Overall

Player ratings are approximately normally distributed in FIFA 22, with a mean of 65.77 and a standard deviation of 6.88.

# Distribution of the overall rating across all players.
# `overall` holds whole numbers, so one bin per rating point is used; the
# default of 30 bins spreads the integer values unevenly over the bins and
# makes the histogram look jagged.
df %>%
  ggplot(aes(x = overall)) +
  geom_histogram(binwidth = 1, color = "white", fill = "darkgrey") +
  ggtitle(
    "Player Ratings Are Normally Distributed",
    subtitle = "The mean can be used as a measure of central tendency"
  ) +
  theme_fivethirtyeight() +
  theme(axis.text.y = element_blank())

Now we can check the distribution for each league.

# First-level leagues whose average overall rating is at least 70.
top_league <- df %>%
  filter(league_level == 1) %>%
  group_by(league_name) %>%
  summarise(avg_rating = mean(overall)) %>%
  filter(avg_rating >= 70) %>%
  select(league_name)

# All players of those leagues.
top_league_player <- df %>%
  filter(league_name %in% top_league$league_name)

# Per-league summary used for the mean line and its label.
summ <- top_league_player %>%
  group_by(league_name) %>%
  summarise(
    avg_rating = mean(overall),
    n_player = n(),
    avg_age = mean(age)
  )

# Plot size for notebook front ends. The option is spelled
# `repr.plot.height`; the previous `repr.plot.heigh` was silently ignored.
options(repr.plot.width = 12, repr.plot.height = 8)

# One histogram per league with the league mean as a red line.
# - `bins = 30` is ggplot2's default, stated explicitly so the bin count is
#   deliberate and no "pick better value" message is emitted.
# - The constant `fill` overrides the mapped `fill = league_name`; the mapping
#   is kept unchanged (presumably it still feeds the ggplotly hover text -
#   TODO confirm).
# - `labs(y = NULL)` is the supported way to drop an axis title;
#   element_blank() is a theme element, not a label.
ggplotly(
  ggplot() +
    geom_histogram(
      top_league_player,
      mapping = aes(overall, fill = league_name),
      bins = 30,
      color = "white",
      fill = "darkgrey"
    ) +
    geom_vline(
      summ,
      mapping = aes(xintercept = avg_rating),
      size = 0.5,
      color = "red"
    ) +
    geom_text(
      summ,
      mapping = aes(
        x = avg_rating - 5,
        y = 100,
        label = round(avg_rating, digits = 2)
      )
    ) +
    facet_wrap(league_name ~ .) +
    theme_fivethirtyeight() +
    theme(legend.position = "bottom", axis.text.y = element_blank()) +
    labs(
      y = NULL,
      x = "Overall Rating",
      title = "Rating Distribution of Player in Top Leagues",
      subtitle = "Overall rating distribution and average overall rating of all top leagues"
    )
)

The average overall rating of a league reflects how skilled its players are: the higher the average overall rating, the higher the skill level. Notably, players in the Campeonato Brasileiro Série A (Brazil), the Czech Republic Gambrinus Liga (Czech Republic), and the Ukrainian Premier League (Ukraine) are about as skilled as players in the German 1. Bundesliga (Germany) or the French Ligue 1 (France), while players in the Spain Primera Division (Spain) are the most skilled.

Age vs Overall Rating

# Mean overall rating per age for outfield players (goalkeepers excluded).
# Only `Overall` is plotted, so the unused `Potential` summary was dropped.
df %>%
  filter(player_positions != "GK") %>%
  group_by(age) %>%
  summarise(Overall = mean(overall)) %>%
  ggplot(aes(x = age, y = Overall)) +
  geom_line(color = "grey50", size = 1) +
  ggtitle(
    "Average rating flattens over the years",
    subtitle = "Player ratings tend not to get better after the age of 30"
  ) +
  theme_fivethirtyeight()

As we can see, the average overall rating flattens out with age and drops sharply after 40.

# Mean overall rating at each age for every non-goalkeeper position class,
# drawn as one stacked panel per class with the age axis limited to 15-45.
df %>%
  filter(Class != "GK") %>%
  group_by(Class, age) %>%
  summarise(Rating = mean(overall)) %>%
  ggplot(mapping = aes(age, Rating, group = Class)) +
  geom_line(colour = "grey50", size = 1) +
  scale_x_continuous(limits = c(15, 45)) +
  labs(title = "Rating over the ages by position class") +
  theme_fivethirtyeight() +
  theme(
    strip.background = element_rect(fill = "darkgrey"),
    strip.text = element_text(colour = "white", face = "bold")
  ) +
  facet_wrap(~Class, ncol = 1)

When this relationship is explored by major position group, we can see that defender ratings tend to begin their decline earliest, at around 33 years of age, while the decline starts closer to 35 for both attackers and midfielders.

When players reach their potential

# Average potential (dashed purple) and average overall rating (grey) per
# age, with a text note placed at age 30.
df %>%
  group_by(age) %>%
  summarise(
    Potential = mean(potential),
    Overall = mean(overall)
  ) %>%
  ggplot(mapping = aes(age)) +
  geom_line(
    mapping = aes(y = Potential),
    colour = "purple",
    linetype = "dashed",
    size = 1
  ) +
  geom_line(mapping = aes(y = Overall), colour = "grey50", size = 1) +
  annotate(
    "text",
    x = 30,
    y = 73,
    label = "Potential meets overall\ntalent at 29 years old",
    colour = "grey50"
  ) +
  labs(
    title = "Potential And Overall Talent Converges",
    subtitle = "The average ratings were taken for each age"
  ) +
  theme_fivethirtyeight()

It appears that player potential and player overall rating converge at around age 29.

# Names of all goalkeeping attribute columns.
gk_vars <- df %>%
  select(contains("goalkeeping")) %>%
  names()

# Numeric columns of non-goalkeepers with club/nation ids, jersey numbers,
# contract year, league level and goalkeeping attributes removed. Rows with
# any missing value are dropped. Built once and shared by both methods
# instead of repeating the whole pipeline.
cor_input <- df %>%
  filter(player_positions != "GK") %>%
  select(where(is.numeric)) %>%
  select(
    -club_team_id, -league_level, -club_jersey_number,
    -club_contract_valid_until, -nationality_id, -nation_team_id,
    -nation_jersey_number, -all_of(gk_vars)
  ) %>%
  as.matrix() %>%
  na.omit()

cor_colnames <- colnames(cor_input)

# Correlation of every feature with `overall`. The column is selected by name;
# the previous positional `[, 2]` only worked while `overall` happened to be
# the second numeric column.
spearman_cor_overall <- cor(cor_input, method = "spearman")[, "overall"] %>%
  data.frame()

spearman_cor_overall <- cbind(cor_colnames, spearman_cor_overall) %>%
  arrange(desc(`.`))

pearson_cor_overall <- cor(cor_input, method = "pearson")[, "overall"] %>%
  data.frame()

pearson_cor_overall <- cbind(cor_colnames, pearson_cor_overall) %>%
  arrange(desc(`.`))

# The shared input matrix is only needed to compute the correlations.
rm(cor_input)

# Ten features with the highest Spearman correlation, with the Pearson value
# alongside. Both inputs carry a value column named `.`, which the join
# suffixes to `..x` / `..y`.
spearman_cor_overall %>%
  left_join(pearson_cor_overall, by = "cor_colnames") %>%
  rename(Feature = cor_colnames, Spearman = `..x`, Pearson = `..y`) %>%
  filter(Feature != "overall") %>%
  head(10) %>%
  kable("html") %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed"),
    font_size = 8
  ) %>%
  scroll_box(width = "100%")
Feature Spearman Pearson
value_eur 0.8815896 0.5627151
movement_reactions 0.8721298 0.8755071
mentality_composure 0.7967710 0.8103614
attacking_short_passing 0.7694905 0.7799224
skill_ball_control 0.7675370 0.7636762
wage_eur 0.7624437 0.6056264
release_clause_eur 0.7228825 0.5366370
passing 0.6959918 0.7150010
dribbling 0.6790794 0.6664023
skill_dribbling 0.6149346 0.5723942

These are the 10 attributes most highly correlated with overall rating.

Which positions are skilled in what?

# Detailed numeric attributes (attacking_*, skill_*, movement_*, power_*,
# mentality_*, defending_*) plus position and class, with the category
# prefix stripped from the column names. `skill_moves` matches the `skill_`
# prefix but is dropped, as before.
tile_data <- df %>%
  select(
    starts_with(c(
      "attacking_", "skill_", "movement_",
      "power_", "mentality_", "defending_"
    )) & where(is.numeric),
    player_positions, Class
  ) %>%
  select(-skill_moves) %>%
  rename_with(
    ~ stringr::str_remove(
      .x, "^(attacking|skill|movement|power|mentality|defending)_"
    )
  )

# Median of every attribute per position, one facet per position class.
# The chain uses `%>%` rather than `%<>%`: the compound-assignment pipe
# overwrote `tile_data` with the plot object and returned it invisibly, so the
# plot was never printed. `tile_data` now stays the wide attribute table.
# NOTE(review): the old subtitle said players were limited to an overall
# rating over 75, but no such filter exists in the code. The subtitle now
# describes what is computed; add the filter upstream if it was intended.
tile_data %>%
  filter(player_positions != "GK") %>%
  pivot_longer(
    cols = -c(player_positions, Class),
    names_to = "Attribute",
    values_to = "Value"
  ) %>%
  group_by(Class, player_positions, Attribute) %>%
  summarise(MedianValue = median(Value, na.rm = TRUE), .groups = "drop") %>%
  ggplot(aes(x = Attribute, y = player_positions)) +
  geom_tile(aes(fill = MedianValue), colour = "black") +
  geom_text(aes(label = MedianValue)) +
  scale_fill_gradient(low = "purple", high = "green") +
  ggtitle(
    "Defenders are strong, forwards are agile",
    subtitle = "Median rating of each attribute for each position (goalkeepers excluded)"
  ) +
  theme_fivethirtyeight() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    strip.text = element_text(face = "bold", size = 12),
    legend.position = "none"
  ) +
  facet_wrap(~Class, scales = "free", ncol = 1)

Team Overall Talent

# Names of the 20 clubs with the highest mean overall rating, best first.
Top_20_clubs <- df %>%
  group_by(club_name) %>%
  summarise(AverageRating = mean(overall, na.rm = TRUE)) %>%
  arrange(desc(AverageRating)) %>%
  head(n = 20) %>%
  pull(club_name)

# The three best clubs are read from the ranking itself; the club names (and
# the club named in the title) used to be hard-coded and could disagree with
# the data.
top_3_clubs <- head(Top_20_clubs, 3)

# Rating distribution per club. reorder() sorts the clubs by their mean
# overall rating, and the top three are highlighted in purple.
df %>%
  filter(club_name %in% Top_20_clubs) %>%
  mutate(Top3 = if_else(club_name %in% top_3_clubs, "Yes", "No")) %>%
  ggplot(aes(x = reorder(club_name, overall), y = overall, fill = Top3)) +
  geom_boxplot(color = "black") +
  scale_fill_manual(values = c(No = "lightgrey", Yes = "purple")) +
  ggtitle(
    paste(Top_20_clubs[1], "has the Highest Overall"),
    subtitle = "The average overall rating of the 20 highest rated teams in the game, sorted in descending order"
  ) +
  coord_flip() +
  theme_fivethirtyeight() +
  theme(legend.position = "none")